In [1]:
from __future__ import print_function
import sys
import os
import pandas as pd
import csv
import seaborn as sns
import time
import matplotlib.pyplot as plt
import alphastats
import warnings
import numpy as np
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl") # remove warning from mac
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook" 
from IPython.display import display, HTML, Markdown
display(HTML("<style>:root { --jp-notebook-max-width: 80% !important; }</style>"))
# Thread-count and figure-export configuration for the rest of the notebook.
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads, _openmp_parallelism_enabled
# NOTE(review): the return value is discarded. This private scikit-learn helper
# looks like a query (it reports a thread count) rather than a setter, so this
# call may not cap anything -- confirm, and consider threadpoolctl if a limit
# of 6 threads is really required.
_openmp_effective_n_threads(6)

# Font type 42 -- presumably chosen so text in exported PDFs stays editable; confirm.
plt.rcParams['pdf.fonttype'] = 42 

# setdefault() leaves a variable untouched when it is already set and returns
# the effective value; the last call is what the cell echoes as Out[1].
# NOTE(review): both variables are set after the imports above have already
# run -- confirm that the library reading them has not been initialised yet.
os.environ.setdefault("TF_NUM_THREADS", "10")
os.environ.setdefault("TF_LOOP_PARALLEL_ITERATIONS", "10")
/Users/magalhae/miniforge3/envs/alphastats/lib/python3.10/site-packages/outdated/utils.py:14: OutdatedPackageWarning: The package pingouin is out of date. Your version is 0.5.3, the latest is 0.5.5.
Set the environment variable OUTDATED_IGNORE=1 to disable these warnings.
  return warn(
Out[1]:
'10'
In [2]:
import plotly

# Qualitative palette registered below as the default trace colour cycle.
colorway = [
    "#B80000",
    "#E81E63",
    "#8D239E",
    "#673AB7",
    "#3F51B5",
    "#253985",
    "#02A8F4",
    "#009688",
    "#8BC34A",
    "#FF9800",
    "#FE5622",
    "#795648",
    "#9E9E9E",
    "#617D8B",
]

# Transparent paper/plot backgrounds plus the palette, layered on top of
# plotly's "simple_white" template for every figure created afterwards.
_palette_layout = plotly.graph_objects.Layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    colorway=colorway,
)
plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
    layout=_palette_layout
)
plotly.io.templates.default = "simple_white+alphastats_colors"

# Preview: one monospace line per colour, hex code followed by a swatch.
_swatches = [
    f'<span style="font-family: monospace">{color} <span style="color: {color}">████████</span></span>'
    for color in colorway
]
display(Markdown('<br>'.join(_swatches)))

# Last expression -> shown as the cell output (number of palette entries).
len(colorway)

#B80000 ████████
#E81E63 ████████
#8D239E ████████
#673AB7 ████████
#3F51B5 ████████
#253985 ████████
#02A8F4 ████████
#009688 ████████
#8BC34A ████████
#FF9800 ████████
#FE5622 ████████
#795648 ████████
#9E9E9E ████████
#617D8B ████████

Out[2]:
14
In [3]:
# Reference protein lists used further down to annotate the measured proteome.
# NOTE(review): absolute, user-specific paths -- the cell only runs on the
# original machine.
Andersen2005 = pd.read_csv(
    '/Users/magalhae/Desktop/FANCY_proteome/NucleolusList/Andersen2005_idmapping_active_true_2024_08_28.tsv',
    sep='\t',
    low_memory=False,
)
SubCellBarcode = pd.read_csv(
    '/Users/magalhae/Desktop/FANCY_proteome/NucleolusList/SubCell_markerProteins_annotated.csv',
    low_memory=False,
)
# sheet_name=1 selects the workbook's second sheet; rows without a UniProt
# accession are dropped right after loading.
Stenstrom2020 = pd.read_excel(
    '/Users/magalhae/Desktop/FANCY_proteome/NucleolusList/msb209469-sup-0002-datasetev1.xlsx',
    sheet_name=1,
).loc[lambda sheet: ~sheet['Uniprot ID'].isna()]

# Column / dtype overview of each table, in loading order.
for reference_table in (Andersen2005, SubCellBarcode, Stenstrom2020):
    reference_table.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   From                                810 non-null    object
 1   Entry                               810 non-null    object
 2   Reviewed                            810 non-null    object
 3   Entry Name                          810 non-null    object
 4   Protein names                       810 non-null    object
 5   Gene Names                          796 non-null    object
 6   GeneID                              787 non-null    object
 7   Ensembl                             765 non-null    object
 8   Gene Ontology (molecular function)  781 non-null    object
 9   Gene Ontology (biological process)  777 non-null    object
 10  Gene Ontology (cellular component)  800 non-null    object
dtypes: object(11)
memory usage: 69.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3365 entries, 0 to 3364
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ProteinID                           3365 non-null   object 
 1   Compartments                        3365 non-null   object 
 2   Cyto                                3365 non-null   float64
 3   Nsol                                3365 non-null   float64
 4   NucI                                3365 non-null   float64
 5   Horg                                3365 non-null   float64
 6   Lorg                                3365 non-null   float64
 7   Colour                              3365 non-null   object 
 8   ProteinsName_Subcell                3365 non-null   object 
 9   Entry Name                          3364 non-null   object 
 10  Protein names                       3364 non-null   object 
 11  Gene Names                          3364 non-null   object 
 12  GeneID                              3364 non-null   object 
 13  Ensembl                             3364 non-null   object 
 14  Gene Ontology (molecular function)  3364 non-null   object 
 15  Gene Ontology (biological process)  3364 non-null   object 
 16  Gene Ontology (cellular component)  3364 non-null   object 
dtypes: float64(5), object(12)
memory usage: 447.0+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 1294 entries, 0 to 1317
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   ENSG ID                        1294 non-null   object
 1   Gene name                      1294 non-null   object
 2   Uniprot ID                     1294 non-null   object
 3   Antibody ID                    1294 non-null   object
 4   IF gene location score         1294 non-null   object
 5   Nucleolar location             1294 non-null   object
 6   Confirmed in previous studies  308 non-null    object
dtypes: object(7)
memory usage: 80.9+ KB
In [4]:
# Load the MaxQuant proteinGroups table and report its size before filtering.
ProteinGroups = pd.read_csv(
    '/Users/magalhae/Desktop/FANCY_proteome/FANCY_NPM1_20241206/txt/proteinGroups.txt',
    sep='\t',
    low_memory=False,
)
ProteinGroups.info()

# Drop every protein group whose ID string contains "CON_" or "REV__"
# (both patterns are applied as regular expressions by str.contains).
protein_ids = ProteinGroups['Protein IDs']
has_con_tag = protein_ids.str.contains("CON_")
has_rev_tag = protein_ids.str.contains("REV__")
ProteinGroups = ProteinGroups[~has_con_tag & ~has_rev_tag]

# Size after filtering.
ProteinGroups.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4002 entries, 0 to 4001
Columns: 474 entries, Protein IDs to Taxonomy names
dtypes: float64(202), int64(201), object(71)
memory usage: 14.5+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 3813 entries, 0 to 3945
Columns: 474 entries, Protein IDs to Taxonomy names
dtypes: float64(202), int64(201), object(71)
memory usage: 13.8+ MB
In [5]:
# Build the alphastats DataSet: the filtered proteinGroups DataFrame goes into
# a MaxQuant loader, which is combined with the sample metadata file.
maxquant_data = alphastats.MaxQuantLoader(ProteinGroups)
# maxquant_data = alphastats.MaxQuantLoader(file="proteinGroups.txt")
# NOTE(review): "Metadata.csv" is resolved against the current working
# directory, unlike the absolute paths used for the other inputs.
ds = alphastats.DataSet(
    loader = maxquant_data, 
    metadata_path="Metadata.csv", 
    sample_column="sample"
)
# Sanity check on sample counts. In the recorded run the two numbers differ
# (48 in the matrix vs 34 in the metadata) and alphastats warns that 14 metadata
# samples are absent from the protein data -- worth confirming this is intended.
print(f"Number of samples in the matrix: {ds.mat.shape[0]}, number of samples in metadata: {ds.metadata.shape[0]}.")
WARNING:root:['#Parental-input-4', '#ActD-input-4', '#KSx2-FANCI-4', '#noKS-input-4', '#F_G-FANCI-4', '#KSx2-input-4', '#noKS-FANCI-4', '#Parental-FANCI-4', '#ActD-FANCI-4', '#KSx2-input-1', '#KSx1-FANCI-4', '#KSx2-FANCI-1', '#F_G-input-4', '#KSx1-input-4'] are not described in the protein data andare removed from the metadata.
DataSet has been created.
Attributes of the DataSet can be accessed using: 
DataSet.rawinput:	 Raw Protein data.
DataSet.mat:		Processed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.
DataSet.metadata:	Metadata for the samples in the matrix. Metadata will be matched with DataSet.mat when needed (for instance Volcano Plot).
Number of samples in the matrix: 48, number of samples in metadata: 34.
In [6]:
# Treat zero intensities as missing values. This edits ds.mat in place, so the
# change persists in the DataSet until its matrix is rebuilt.
ds.mat.replace(0, np.nan, inplace=True)
In [7]:
#ds.preprocess(subset = True, log2_transform =False,remove_contaminations=True)
# First preprocessing pass: log2-transform only (no normalisation, no imputation).
# subset=True presumably restricts the matrix to samples present in the
# metadata (34 samples are reported afterwards) -- confirm against alphastats docs.
ds.preprocess(subset = True, log2_transform =True)
Data has been log2-transformed.
In [8]:
# Per-sample intensity box plots, coloured by the metadata "group" column.
# The figure is rendered inline as SVG and also written to a PDF next to the notebook.
plot = ds.plot_sampledistribution(method = "box", color = "group", log_scale = True)
plot.show(renderer = "svg", width=600, height=400)
plot.write_image("sampledistribution.pdf")
No description has been provided for this image
In [9]:
ds.preprocess_print_info()
                                                    0  \
0                   Raw data number of Protein Groups   
1          Matrix: Number of ProteinIDs/ProteinGroups   
2                           Matrix: Number of samples   
3                         Intensity used for analysis   
4                                    Log2-transformed   
5                                       Normalization   
6                                          Imputation   
7                    Contaminations have been removed   
8                               Contamination columns   
9   Number of removed ProteinGroups due to contami...   
10                          Data completeness cut-off   

                                                    1  
0                                                2963  
1                                                2963  
2                                                  34  
3                              LFQ intensity [sample]  
4                                                True  
5                                                None  
6                                                None  
7                                               False  
8   [Only identified by site, Reverse, Potential c...  
9                                                   0  
10                                                  0  
In [10]:
# Intensity histogram for every sample; shown inline as SVG and exported to PDF.
plot = ds.plot_samplehistograms()
plot.show(renderer = "svg", width=1000, height=1000)
plot.write_image("samplehistograms.pdf")
No description has been provided for this image
In [11]:
# Sample-to-sample Pearson correlation matrix of the current (log2, not yet
# normalised) matrix; the PDF name records that this is the pre-normalisation view.
plot = ds.plot_correlation_matrix(method = "pearson")
plot.show(renderer = "svg", width=1200, height=1200)
plot.write_image("cluster_before_Norm.pdf")
No description has been provided for this image
In [ ]:
 
In [6]:
# Redo preprocessing from a clean state with VST normalisation (no log2
# transform, no imputation) and report how long it took.
start = time.time()
ds.reset_preprocessing()
#ds.mat.replace(0, np.nan, inplace=True)
ds.preprocess(
    subset = True,
    remove_contaminations=True,
    log2_transform =False,
    normalization = 'vst',
    imputation = None
)
from numpy import inf
# Overwrite any -inf entries in the processed matrix with 0.
ds.mat[ds.mat == -inf] = 0
end = time.time()

# Elapsed wall-clock seconds for reset + preprocessing.
print(end - start)
All preprocessing steps are reset.
0.2516012191772461
In [118]:
import pickle 
# Serialise the whole DataSet object, in whatever state it currently is, to disk.
# NOTE(review): the file name says "quantile", while the visible preprocessing
# cells request 'vst' or no normalisation -- confirm which state was pickled.
with open('NucleolarProt_quantile_20241209.pickle', 'wb') as output:
    pickle.dump(ds, output)
In [119]:
# Per-sample box plots again, this time on the re-processed matrix and without
# an additional log scale; shown inline and exported to PDF.
plot = ds.plot_sampledistribution(method = "box", color = "group", log_scale = False)
plot.show(renderer = "svg", width=600, height=400)
plot.write_image("sample_Dist_norm.pdf")
No description has been provided for this image
In [120]:
# Pearson correlation matrix after re-processing, with a smaller font so the
# sample labels fit.
# NOTE(review): the PDF name mentions "svd_quantile"; the visible preprocessing
# cells use 'vst' or no normalisation -- confirm the name matches the data.
plot = ds.plot_correlation_matrix(method = "pearson")
plot.update_layout(font=dict(size=8))
plot.show(renderer = "svg", width=600, height=600)
plot.write_image("correlation_matrix_after_svd_quantile.pdf")
No description has been provided for this image
In [8]:
# PCA of the samples coloured by the metadata "group" column, without group circles.
plot = ds.plot_pca(group = "group", circle = False)
# NOTE(review): equal_axes is passed to show(), not to plot_pca(); it is
# presumably ignored by the renderer -- confirm whether equal axis scaling was
# meant to be applied to the figure itself.
plot.show(renderer = "svg", width=500, height=400, equal_axes=True)
plot.write_image("PCA_norm_all.pdf", width=500, height=400)
No description has been provided for this image
In [13]:
# Redo preprocessing from a clean state: log2 transform only, no normalisation
# and no imputation. This is the matrix the heatmap cells below work on when
# the notebook is executed in the recorded order.
start = time.time()
ds.reset_preprocessing()
# Pseudocount of 1 added to the raw matrix.
# NOTE(review): whether preprocess() reads ds.rawmat or ds.mat is not visible
# here -- confirm that this line affects the log2 values, and that re-running
# the cell does not add the pseudocount more than once.
ds.rawmat = ds.rawmat + 1
#ds.mat.replace(0, np.nan, inplace=True)
ds.preprocess(
    subset = True,
    remove_contaminations=True,
    log2_transform =True,
    normalization = None,
    imputation = None
)
from numpy import inf
# Overwrite any -inf entries in the processed matrix with 0.
ds.mat[ds.mat == -inf] = 0
end = time.time()

# Elapsed wall-clock seconds for reset + preprocessing.
print(end - start)
All preprocessing steps are reset.
Data has been log2-transformed.
0.053072214126586914
In [12]:
import plotly
from alphastats.plots.PlotUtils import PlotUtils, plotly_object
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce

import scipy
from scipy.spatial import distance
from scipy.cluster import hierarchy
import sklearn
from sklearn.metrics import silhouette_score
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph

np.random.seed(12345)
In [124]:
def element_exists(lst, element):
    """Return True when `element` is contained in `lst`, otherwise False.

    Uses the membership operator instead of probing `lst.index()` and catching
    ValueError, so any container that supports `in` (list, tuple, set, ...) is
    accepted, not only objects that expose an `.index()` method.
    """
    return element in lst
In [125]:
# anova on ds to remove genes that dont change
# ANOVA across the metadata "group" column, used to filter out proteins that do not change
anova_df = ds.anova(column="group", tukey=False)
# save the ANOVA results
# NOTE(review): the file name mentions Tukey and quantile/svd processing, but
# tukey=False is passed above -- confirm the name still describes the content.
anova_df.to_csv('anova_df_Tukey_alphastats_processed_quantile_svd.csv', index=True)

# use the protein-group identifier as the index
anova_df = anova_df.set_index('Protein IDs')

# anova_df =  pd.read_csv('anova_df_Tukey_alphastats_processed_quantile_svd.csv',low_memory=False)
# significant_proteins = anova_df[anova_df["Protein IDs"]][ds.index_column].to_list()
In [126]:
# anova_df =  pd.read_csv('./Heatmap/anova_df_Tukey_alphastats_processed_quantile_svd.csv',low_memory=False).set_index('Protein IDs')
# Protein groups with an ANOVA p-value below 0.05.
# NOTE(review): the threshold is applied to "ANOVA_pvalue" as stored -- confirm
# whether a multiple-testing correction is expected at this point.
significant_proteins = anova_df[anova_df["ANOVA_pvalue"] < 0.05].index.to_list()

# Always keep the mCherry entry in the list, whatever its p-value.
if "mCherry" not in significant_proteins:
    significant_proteins.append("mCherry")

# Keep the DataFrame itself in `significant_proteins_df` (to_csv() returns None
# when given a path, so chaining it into the assignment stored None).
significant_proteins_df = pd.DataFrame(significant_proteins, columns=["Protein IDs"])
significant_proteins_df.to_csv('significat_proteins_anova.csv', index=False)
In [127]:
len(significant_proteins)
Out[127]:
2369
In [14]:
# Proteins as rows, samples as columns.
df = pd.DataFrame(ds.mat).T

# Desired left-to-right sample order: input fraction first, then FANCI, each
# running Parental -> noKS -> KSx1 -> KSx2 -> F_G. Samples not named here
# (every replicate 4, KSx2 replicate 1, all ActD samples) are left out.
groups = [
    # input
    "Parental-input-1", "Parental-input-2", "Parental-input-3",
    "noKS-input-1", "noKS-input-2", "noKS-input-3",
    "KSx1-input-1", "KSx1-input-2", "KSx1-input-3",
    "KSx2-input-2", "KSx2-input-3",
    "F_G-input-1", "F_G-input-2", "F_G-input-3",
    # FANCI
    "Parental-FANCI-1", "Parental-FANCI-2", "Parental-FANCI-3",
    "noKS-FANCI-1", "noKS-FANCI-2", "noKS-FANCI-3",
    "KSx1-FANCI-1", "KSx1-FANCI-2", "KSx1-FANCI-3",
    "KSx2-FANCI-2", "KSx2-FANCI-3",
    "F_G-FANCI-1", "F_G-FANCI-2", "F_G-FANCI-3",
]

# Collect, for each requested name in turn, the positions of all columns whose
# label starts with that name (prefix match; names without a match add nothing).
new_order = []
for sample_prefix in groups:
    is_match = df.columns.str.startswith(sample_prefix)
    new_order.extend(np.where(is_match)[0].tolist())

# Column-reordered (and column-filtered) view of the matrix.
ordered_df = df.iloc[:, new_order]

#ordered_df = ordered_df.loc[ordered_df.index.isin(significant_proteins)]
In [15]:
# Shape before and after column selection, one per line.
print(df.shape, ordered_df.shape, sep="\n")
(2963, 34)
(2963, 28)
In [16]:
# Flag every protein group by membership in each nucleolar reference list.
# The accession lists are built once up front; filtering SubCellBarcode (or
# building a set) inside the per-row lambda repeated that work for every row.
reference_accessions = {
    'Andersen2005': Andersen2005['Entry'].tolist(),
    # SubCellBarcode compartment N3 only
    'SubCellBarcode': SubCellBarcode.loc[
        SubCellBarcode['Compartments'] == "N3", 'ProteinID'
    ].tolist(),
    # any of the SubCellBarcode compartments N1-N4
    'SubCellBarcode_N': SubCellBarcode.loc[
        SubCellBarcode['Compartments'].isin(['N1', 'N2', 'N3', 'N4']), 'ProteinID'
    ].tolist(),
    'Stenstrom2020': list(set(Stenstrom2020['Uniprot ID'])),
}

ordered_df = ordered_df.reset_index()
for flag_column, accessions in reference_accessions.items():
    # Substring test against the whole "id1;id2;..." group string, exactly as
    # before: a group is flagged when any reference accession occurs anywhere
    # in it (so an accession also matches its isoform-suffixed IDs).
    ordered_df[flag_column] = ordered_df['Protein IDs'].apply(
        lambda protein_group, accessions=accessions: any(
            accession in protein_group for accession in accessions
        )
    )
ordered_df = ordered_df.set_index('Protein IDs')
In [17]:
ordered_df
Out[17]:
Parental-input-1 Parental-input-2 Parental-input-3 noKS-input-1 noKS-input-2 noKS-input-3 KSx1-input-1 KSx1-input-2 KSx1-input-3 KSx2-input-2 ... KSx1-FANCI-3 KSx2-FANCI-2 KSx2-FANCI-3 F_G-FANCI-1 F_G-FANCI-2 F_G-FANCI-3 Andersen2005 SubCellBarcode SubCellBarcode_N Stenstrom2020
Protein IDs
Q9Y2S6;A0A024R1R8 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 15.058499 0.000000 15.134827 0.000000 14.995988 15.129726 True False False True
P0DPI2;A0A0B4J2D5;P0DPI2-2;A0A0B4J2D5-2 16.127007 16.171724 16.302389 17.160541 17.135208 16.989483 16.418396 17.145992 17.123232 16.869305 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False False False False
A0AVT1;A0AVT1-2;A0AVT1-4;A0AVT1-3 0.000000 0.000000 0.000000 16.214755 0.000000 16.225829 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False False False False
A0FGR8;A0FGR8-2;A0FGR8-6;A0FGR8-5;A0FGR8-4 0.000000 14.710430 14.500655 14.183868 14.309263 14.261728 0.000000 14.435149 14.300996 14.402879 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False False False False
A0MZ66;A0MZ66-3;A0MZ66-4;A0MZ66-5;A0MZ66-6;A0MZ66-8;A0MZ66-2;A0MZ66-7 15.986131 16.281568 16.316688 16.525429 16.613818 16.518899 16.191657 16.625281 16.599680 16.617840 ... 0.000000 13.876037 13.981032 13.482682 0.000000 0.000000 False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Q9Y6Q9;Q9Y6Q9-5;Q9Y6Q9-4;Q9Y6Q9-2;Q9Y6Q9-3 0.000000 0.000000 0.000000 0.000000 0.000000 13.752799 0.000000 0.000000 13.723554 13.733121 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False False False False
Q9Y6V7 0.000000 14.943431 14.974639 14.707413 14.810270 14.855744 0.000000 14.709784 14.384042 14.685022 ... 18.080047 17.515068 17.818158 17.949576 18.061023 17.900242 False False True True
Q9Y6W5;Q9Y6W5-2;Q9UPY6;Q9UPY6-2 0.000000 14.702064 14.746357 14.566470 15.043924 14.765182 14.376193 14.627192 14.967632 14.954469 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False False False False
Q9Y6X9;Q9Y6X9-2 0.000000 0.000000 0.000000 0.000000 0.000000 13.841761 0.000000 0.000000 0.000000 0.000000 ... 14.614422 14.247037 0.000000 14.704012 0.000000 14.596015 False False False False
Q9Y6Y8;Q9Y6Y8-2 15.779232 15.774195 15.713601 16.263434 16.335826 16.171314 15.293005 16.320236 16.450712 16.225207 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False False False False

2963 rows × 32 columns

In [18]:
# True/False tallies for each reference-list flag column.
for flag_column in ("Andersen2005", "SubCellBarcode", "Stenstrom2020", "SubCellBarcode_N"):
    print(ordered_df[flag_column].value_counts())
Andersen2005
False    2370
True      593
Name: count, dtype: int64
SubCellBarcode
False    2793
True      170
Name: count, dtype: int64
Stenstrom2020
False    2572
True      391
Name: count, dtype: int64
SubCellBarcode_N
False    2470
True      493
Name: count, dtype: int64
In [19]:
def search(regex: str, df, case=False):
    """Return the rows of `df` in which any text column matches `regex`.

    Only object/string-typed columns are inspected. Matching is a regular
    expression search, case-insensitive unless `case=True`; missing values
    never count as a match.
    """

    def _column_hits(column):
        # Boolean Series: which cells of this column contain the pattern.
        return column.str.contains(regex, regex=True, case=case, na=False)

    text_columns = df.select_dtypes(include=[object, "string"])
    row_has_hit = text_columns.apply(_column_hits).any(axis=1)
    return df[row_has_hit]
In [20]:
# Move the four membership flags out of the matrix so that ordered_df holds
# only sample columns again. pop() removes each column from ordered_df in
# place and returns it as a Series indexed by protein group; re-running this
# cell without re-running the annotation cell first therefore fails.
# NOTE: the variable is spelled "Anderson2005_df" while the column is
# "Andersen2005"; the heatmap cell uses the variable name as spelled here.
Anderson2005_df = ordered_df.pop("Andersen2005")
SubCellBarcode_df = ordered_df.pop("SubCellBarcode")
SubCellBarcode_N_df = ordered_df.pop("SubCellBarcode_N")
Stenstrom2020_df = ordered_df.pop("Stenstrom2020")
In [136]:
ordered_df
Out[136]:
Parental-input-1 Parental-input-2 Parental-input-3 Parental-input-4 noKS-input-1 noKS-input-2 noKS-input-3 noKS-input-4 KSx1-input-1 KSx1-input-2 ... noKS-FANCI-4 KSx1-FANCI-1 KSx1-FANCI-2 KSx1-FANCI-3 KSx2-FANCI-2 KSx2-FANCI-3 F_G-FANCI-1 F_G-FANCI-2 F_G-FANCI-3 F_G-FANCI-4
Protein IDs
Q9Y2S6;A0A024R1R8 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 16.362680 0.000000 14.689507 15.058499 0.000000 15.134827 0.000000 14.995988 15.129726 15.115166
P0DPI2;A0A0B4J2D5;P0DPI2-2;A0A0B4J2D5-2 16.127007 16.171724 16.302389 16.043220 17.160541 17.135208 16.989483 17.278667 16.418396 17.145992 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
A0AVT1;A0AVT1-2;A0AVT1-4;A0AVT1-3 0.000000 0.000000 0.000000 0.000000 16.214755 0.000000 16.225829 16.499581 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
A0FGR8;A0FGR8-2;A0FGR8-6;A0FGR8-5;A0FGR8-4 0.000000 14.710430 14.500655 14.459175 14.183868 14.309263 14.261728 14.143144 0.000000 14.435149 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
A0MZ66;A0MZ66-3;A0MZ66-4;A0MZ66-5;A0MZ66-6;A0MZ66-8;A0MZ66-2;A0MZ66-7 15.986131 16.281568 16.316688 16.493652 16.525429 16.613818 16.518899 16.699139 16.191657 16.625281 ... 12.764021 0.000000 0.000000 0.000000 13.876037 13.981032 13.482682 0.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Q9Y6Q9;Q9Y6Q9-5;Q9Y6Q9-4;Q9Y6Q9-2;Q9Y6Q9-3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 13.752799 13.800091 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Q9Y6V7 0.000000 14.943431 14.974639 14.900206 14.707413 14.810270 14.855744 15.003474 0.000000 14.709784 ... 17.388101 17.867315 17.996893 18.080047 17.515068 17.818158 17.949576 18.061023 17.900242 17.826685
Q9Y6W5;Q9Y6W5-2;Q9UPY6;Q9UPY6-2 0.000000 14.702064 14.746357 14.971364 14.566470 15.043924 14.765182 15.023235 14.376193 14.627192 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Q9Y6X9;Q9Y6X9-2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 13.841761 14.197985 0.000000 0.000000 ... 13.827640 0.000000 0.000000 14.614422 14.247037 0.000000 14.704012 0.000000 14.596015 0.000000
Q9Y6Y8;Q9Y6Y8-2 15.779232 15.774195 15.713601 15.174184 16.263434 16.335826 16.171314 16.080672 15.293005 16.320236 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

2963 rows × 34 columns

In [21]:
# Overall value range of the matrix, used to pick the heatmap colour limits:
# minimum, maximum, then the 10th / 50th / 95th percentiles over all cells.
print(np.min(ordered_df))
print(np.max(ordered_df))
print(np.quantile(ordered_df, [0.1, 0.5, 0.95]))
0.0
24.018522134211906
[ 0.          0.         18.56395396]
In [138]:
SubCellBarcode_df.unique()
Out[138]:
array([False,  True])
In [25]:
### Display some proteins

sns.set(font_scale=0.6)

lut1 = dict(zip(Anderson2005_df.unique(), ['#DB3D6E','#ffffff']))
row_colors1 = Anderson2005_df.map(lut1)

lut2 = dict(zip(SubCellBarcode_N_df.unique(), ['#ffffff','#1976D2']))
row_colors2 = SubCellBarcode_N_df.map(lut2)

lut3 = dict(zip(SubCellBarcode_df.unique(), ['#ffffff','#00BFC4']))
row_colors3 = SubCellBarcode_df.map(lut3)

lut4 = dict(zip(Stenstrom2020_df.unique(), ['#6200EA','#ffffff']))
row_colors4 = Stenstrom2020_df.map(lut4)

row_colors = pd.concat([row_colors1,row_colors2,row_colors3,row_colors4],axis=1)


heat_clust = sns.clustermap(ordered_df,
                            col_cluster=False,
                            yticklabels=False,
                            method='ward',
                            cmap="rocket",
                            row_colors=row_colors,
                            #cmap="coolwarm",
                            #cmap="icefire",
                            #cmap="RdPu",
                            #cmap="mako",
                            #cmap="YlGnBu",
                            #cmap=sns.diverging_palette(220, 20, as_cmap=True),
                            # cmap="RdYlBu",
                            # center=0,
                            #mask=(ordered_df==0),
                             vmin=0,
                             vmax=20,
                            cbar_kws=dict(
                                #ticks=[0,0.5, 1],
                                #ticks=[-2,0,1, 2],
                                orientation="horizontal",  
                            )
                           )

x0, _y0, _w, _h = heat_clust.cbar_pos
heat_clust.ax_cbar.set_position([x0,
                                 0.9,
                                 heat_clust.ax_row_dendrogram.get_position().width/2,
                                 0.02]
                               )
heat_clust.ax_cbar.set_title('Normalized LFQ')
heat_clust.ax_cbar.tick_params(axis='x',
                               length=5)

from matplotlib.pyplot import gcf


# #legend Anderson2005
# for label in Anderson2005_df.unique():
#     heat_clust.ax_col_dendrogram.bar(0, 0, color=lut1[label], label=label, linewidth=0)

# l1 = heat_clust.ax_col_dendrogram.legend(title='Anderson2005', loc="center", ncol=5, bbox_to_anchor=(0.9, 0.9), bbox_transform=gcf().transFigure)

# #legend SubCellBarcode_N
# for label in SubCellBarcode_N_df.unique():
#     heat_clust.ax_col_dendrogram.bar(0, 0, color=lut2[label], label=label, linewidth=0)

# l2 = heat_clust.ax_col_dendrogram.legend(title='SubCellBarcode_N', loc="center", ncol=5, bbox_to_anchor=(0.9, 0.9), bbox_transform=gcf().transFigure)

# #legend SubCellBarcode_N
# for label in SubCellBarcode_df.unique():
#     heat_clust.ax_col_dendrogram.bar(0, 0, color=lut3[label], label=label, linewidth=0)

# l3 = heat_clust.ax_col_dendrogram.legend(title='SubCellBarcode_N3', loc="center", ncol=5, bbox_to_anchor=(0.9, 0.9), bbox_transform=gcf().transFigure)


# #legend Stenstrom2020_df
# for label in Stenstrom2020_df.unique():
#     heat_clust.ax_col_dendrogram.bar(0, 0, color=lut4[label], label=label, linewidth=0)

# l4 = heat_clust.ax_col_dendrogram.legend(title='Stenstrom2020', loc="center", ncol=5, bbox_to_anchor=(0.9, 0.9), bbox_transform=gcf().transFigure)


reordered_labels = ordered_df.index[heat_clust.dendrogram_row.reordered_ind].tolist()
use_labels = []

use_labels = ['msfGFP-NPM1', 'mCherry',
              'Q6NW34', #NEPRO
              'Q9BSC4', #NOL10
              'O15446', #POLR1G
              'Q9P1U0', #POLR1H
              'P24928', #POLR2A
              'O75683', #SURF6
              'Q07020;Q07020-2' #RLP18
             ]

use_labels_correct = []
use_labels_g = []
use_labels_c = []
for labels in use_labels:
    use_labels_correct.append(search(labels, ds.rawinput, case=False)['Protein IDs'].values[0])

for labels in use_labels_correct:
    if labels in reordered_labels:
        use_labels_c.append(labels)
        use_labels_g.append(ds.rawinput.set_index('Protein IDs').loc[[labels], "Gene names"].values[0].split(";")[0])
    else: continue


use_ticks = [reordered_labels.index(label) + .5 for label in use_labels_c]

heat_clust.ax_heatmap.set(yticks=use_ticks, yticklabels=use_labels_g)

heat_clust.fig.set_size_inches((10,10))
heat_clust.savefig("Heatmap_clustered_filtered_svd_qualite_polimerase.pdf", format = 'pdf')
heat_clust.savefig("Heatmap_clustered_filtered_svd_qualite_polimerase.png", format = 'png', dpi = 300)
No description has been provided for this image
In [26]:
# Handles on the clustered result: the y tick label objects currently on the
# heatmap, the data in plotted (clustered) row order, and that row order itself.
labels = heat_clust.ax_heatmap.yaxis.get_majorticklabels()
ordered_row_df_heat = heat_clust.data2d
row_order = heat_clust.data2d.index
In [27]:
# Stand-alone redraw of the row dendrogram from the clustermap's linkage, so
# that scipy assigns a cluster colour label to every leaf (`den` is used by
# the following cell to annotate the rows).
plt.figure(figsize=(3,10))
den = scipy.cluster.hierarchy.dendrogram(heat_clust.dendrogram_row.linkage,
                                         #labels = list(heat_clust.data2d.index),
                                         orientation='left',
                                         # distance cut-off that decides the coloured clusters;
                                         # 500 is hard-coded -- TODO confirm how it was chosen
                                         color_threshold = 500,
                                         get_leaves = True
                                         )

plt.savefig('den_Heatmap_clustered_filtered_svd_qualite.png', format='png', bbox_inches='tight')
plt.savefig('den_Heatmap_clustered_filtered_svd_qualite.pdf', format='pdf', bbox_inches='tight')
No description has been provided for this image
In [28]:
# Attach cluster information to the clustered data.
# ordered_row_df is the same object as heat_clust.data2d (no copy), so the
# inserts below also add the columns to heat_clust.data2d; a later cell reads
# heat_clust.data2d['cluster'] and depends on that. Because the columns then
# already exist, re-running this cell is expected to fail on the first insert.
ordered_row_df = heat_clust.data2d
# extract ivl list (order of the dendrogram), leaves_color_list (cluster)
# NOTE(review): values are assigned by position -- this assumes the leaf order
# of `den` equals the row order of data2d; confirm.
ordered_row_df.insert(0 , "cluster", den["leaves_color_list"])
# running position of each row in the clustered order
ordered_row_df.insert(1 ,'order', np.arange(len(ordered_df)))
ordered_row_df.insert(2 , "original_index", den["ivl"])
ordered_row_df.head()
Out[28]:
cluster order original_index Parental-input-1 Parental-input-2 Parental-input-3 noKS-input-1 noKS-input-2 noKS-input-3 KSx1-input-1 ... noKS-FANCI-2 noKS-FANCI-3 KSx1-FANCI-1 KSx1-FANCI-2 KSx1-FANCI-3 KSx2-FANCI-2 KSx2-FANCI-3 F_G-FANCI-1 F_G-FANCI-2 F_G-FANCI-3
Protein IDs
P16403 C1 0 614 0.0 0.000000 21.502760 20.877101 0.000000 21.907566 0.000000 ... 21.048599 21.805933 20.488856 21.283932 21.519702 20.873651 21.064540 20.876277 21.063026 21.310968
Q12788 C1 1 1392 0.0 15.664364 15.939212 15.653993 15.840680 15.698922 0.000000 ... 19.012963 18.961919 18.686473 18.903658 19.111691 18.550784 18.813531 18.766684 19.180201 18.899681
Q9NQT5;Q9NQT5-2 C1 2 2568 0.0 17.195757 16.921260 16.603539 16.772753 16.785963 0.000000 ... 18.061076 17.987486 17.975109 17.709676 17.936408 17.542533 17.700020 18.410427 17.969387 18.042333
Q6UX04;Q6UX04-2 C1 3 1810 0.0 14.662780 14.633960 14.902611 14.675681 15.051081 14.934152 ... 14.777666 15.280735 15.137431 14.876613 14.998590 14.250150 14.964612 15.724647 14.753321 15.488123
Q9NQW6;Q9NQW6-2 C1 4 2569 0.0 15.255102 15.271901 14.741888 14.937834 15.016286 14.835360 ... 14.426134 14.360298 14.934935 14.534972 14.502708 14.414091 14.624567 14.998238 14.619188 14.880253

5 rows × 31 columns

In [29]:
# Re-attach the reference-list membership flags to the clustered table.
# The accession lists are built once up front; filtering SubCellBarcode (or
# building a set) inside the per-row lambda repeated that work for every row.
reference_accessions = {
    'Andersen2005': Andersen2005['Entry'].tolist(),
    # SubCellBarcode compartment N3 only
    'SubCellBarcode': SubCellBarcode.loc[
        SubCellBarcode['Compartments'] == "N3", 'ProteinID'
    ].tolist(),
    # any of the SubCellBarcode compartments N1-N4
    'SubCellBarcode_N': SubCellBarcode.loc[
        SubCellBarcode['Compartments'].isin(['N1', 'N2', 'N3', 'N4']), 'ProteinID'
    ].tolist(),
    'Stenstrom2020': list(set(Stenstrom2020['Uniprot ID'])),
}

ordered_row_df = ordered_row_df.reset_index()
for flag_column, accessions in reference_accessions.items():
    # Substring test against the whole "id1;id2;..." group string, exactly as
    # before: a group is flagged when any reference accession occurs anywhere
    # in it (so an accession also matches its isoform-suffixed IDs).
    ordered_row_df[flag_column] = ordered_row_df['Protein IDs'].apply(
        lambda protein_group, accessions=accessions: any(
            accession in protein_group for accession in accessions
        )
    )
ordered_row_df = ordered_row_df.set_index('Protein IDs')
In [30]:
# Add the first 11 columns of the proteinGroups table (after 'Protein IDs'
# becomes the index) to every clustered row. Both sides are joined on their
# index with pd.merge's default inner join, so rows missing from either side
# are dropped.
ordered_row_df = pd.merge(ordered_row_df, ProteinGroups.set_index('Protein IDs').iloc[:, 0:11], left_index=True, right_index=True)
In [31]:
heat_clust.data2d['cluster'].unique()
Out[31]:
array(['C1', 'C2', 'C3', 'C4', 'C5'], dtype=object)
In [32]:
ordered_row_df.to_csv('ordered_row_df_heatmap_wClustering_Total.csv', index=True)
In [119]:
 
No description has been provided for this image
In [ ]: